{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LAB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-10-07T11:57:20.273641Z",
     "start_time": "2023-10-07T11:57:19.803590Z"
    }
   },
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "\n`load_boston` has been removed from scikit-learn since version 1.2.\n\nThe Boston housing prices dataset has an ethical problem: as\ninvestigated in [1], the authors of this dataset engineered a\nnon-invertible variable \"B\" assuming that racial self-segregation had a\npositive impact on house prices [2]. Furthermore the goal of the\nresearch that led to the creation of this dataset was to study the\nimpact of air quality but it did not give adequate demonstration of the\nvalidity of this assumption.\n\nThe scikit-learn maintainers therefore strongly discourage the use of\nthis dataset unless the purpose of the code is to study and educate\nabout ethical issues in data science and machine learning.\n\nIn this special case, you can fetch the dataset from the original\nsource::\n\n    import pandas as pd\n    import numpy as np\n\n    data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n    raw_df = pd.read_csv(data_url, sep=\"\\s+\", skiprows=22, header=None)\n    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\n    target = raw_df.values[1::2, 2]\n\nAlternative datasets include the California housing dataset and the\nAmes housing dataset. You can load the datasets as follows::\n\n    from sklearn.datasets import fetch_california_housing\n    housing = fetch_california_housing()\n\nfor the California housing dataset and::\n\n    from sklearn.datasets import fetch_openml\n    housing = fetch_openml(name=\"house_prices\", as_frame=True)\n\nfor the Ames housing dataset.\n\n[1] M Carlisle.\n\"Racist data destruction?\"\n<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>\n\n[2] Harrison Jr, David, and Daniel L. Rubinfeld.\n\"Hedonic housing prices and the demand for clean air.\"\nJournal of environmental economics and management 5.1 (1978): 81-102.\n<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>\n",
     "output_type": "error",
     "traceback": [
      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[0;31mImportError\u001B[0m                               Traceback (most recent call last)",
      "Cell \u001B[0;32mIn[1], line 4\u001B[0m\n\u001B[1;32m      1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mpandas\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mpd\u001B[39;00m\n\u001B[1;32m      2\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mnumpy\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mnp\u001B[39;00m\n\u001B[0;32m----> 4\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01msklearn\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdatasets\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m load_boston\n\u001B[1;32m      5\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01msklearn\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpreprocessing\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m PolynomialFeatures\n\u001B[1;32m      7\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mmatplotlib\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpyplot\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m \u001B[38;5;21;01mplt\u001B[39;00m\n",
      "File \u001B[0;32m/opt/anaconda3/envs/py309/lib/python3.9/site-packages/sklearn/datasets/__init__.py:157\u001B[0m, in \u001B[0;36m__getattr__\u001B[0;34m(name)\u001B[0m\n\u001B[1;32m    108\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m name \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mload_boston\u001B[39m\u001B[38;5;124m\"\u001B[39m:\n\u001B[1;32m    109\u001B[0m     msg \u001B[38;5;241m=\u001B[39m textwrap\u001B[38;5;241m.\u001B[39mdedent(\u001B[38;5;124m\"\"\"\u001B[39m\n\u001B[1;32m    110\u001B[0m \u001B[38;5;124m        `load_boston` has been removed from scikit-learn since version 1.2.\u001B[39m\n\u001B[1;32m    111\u001B[0m \n\u001B[0;32m   (...)\u001B[0m\n\u001B[1;32m    155\u001B[0m \u001B[38;5;124m        <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>\u001B[39m\n\u001B[1;32m    156\u001B[0m \u001B[38;5;124m        \u001B[39m\u001B[38;5;124m\"\"\"\u001B[39m)\n\u001B[0;32m--> 157\u001B[0m     \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mImportError\u001B[39;00m(msg)\n\u001B[1;32m    158\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m    159\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mglobals\u001B[39m()[name]\n",
      "\u001B[0;31mImportError\u001B[0m: \n`load_boston` has been removed from scikit-learn since version 1.2.\n\nThe Boston housing prices dataset has an ethical problem: as\ninvestigated in [1], the authors of this dataset engineered a\nnon-invertible variable \"B\" assuming that racial self-segregation had a\npositive impact on house prices [2]. Furthermore the goal of the\nresearch that led to the creation of this dataset was to study the\nimpact of air quality but it did not give adequate demonstration of the\nvalidity of this assumption.\n\nThe scikit-learn maintainers therefore strongly discourage the use of\nthis dataset unless the purpose of the code is to study and educate\nabout ethical issues in data science and machine learning.\n\nIn this special case, you can fetch the dataset from the original\nsource::\n\n    import pandas as pd\n    import numpy as np\n\n    data_url = \"http://lib.stat.cmu.edu/datasets/boston\"\n    raw_df = pd.read_csv(data_url, sep=\"\\s+\", skiprows=22, header=None)\n    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])\n    target = raw_df.values[1::2, 2]\n\nAlternative datasets include the California housing dataset and the\nAmes housing dataset. You can load the datasets as follows::\n\n    from sklearn.datasets import fetch_california_housing\n    housing = fetch_california_housing()\n\nfor the California housing dataset and::\n\n    from sklearn.datasets import fetch_openml\n    housing = fetch_openml(name=\"house_prices\", as_frame=True)\n\nfor the Ames housing dataset.\n\n[1] M Carlisle.\n\"Racist data destruction?\"\n<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>\n\n[2] Harrison Jr, David, and Daniel L. Rubinfeld.\n\"Hedonic housing prices and the demand for clean air.\"\nJournal of environmental economics and management 5.1 (1978): 81-102.\n<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.datasets import load_boston\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "import statsmodels.formula.api as smf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-10-07T11:57:20.283272Z",
     "start_time": "2023-10-07T11:57:20.274875Z"
    }
   },
   "outputs": [],
   "source": [
    "boston = load_boston()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.276280Z"
    }
   },
   "outputs": [],
   "source": [
    "data = pd.DataFrame(boston.data,columns = boston['feature_names'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.277225Z"
    }
   },
   "outputs": [],
   "source": [
    "data['MEDV'] = boston['target']\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.279229Z"
    }
   },
   "outputs": [],
   "source": [
    "sns.regplot(data['LSTAT'],data['MEDV'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.280322Z"
    }
   },
   "outputs": [],
   "source": [
    "# est = smf.ols(y ~ x,data)\n",
    "est = smf.ols('MEDV ~ LSTAT',data = data).fit()\n",
    "print(est.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.281532Z"
    }
   },
   "outputs": [],
   "source": [
    "#dist of residuals\n",
    "plt.ylim(-25,25)\n",
    "sns.scatterplot(est.fittedvalues,est.resid)\n",
    "plt.axhline(y = 0,linewidth = 0.5,linestyle = 'dashed',color = 'black')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Multiple Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.282854Z"
    }
   },
   "outputs": [],
   "source": [
    "string_cols = ' + '.join(data.columns[:-1])\n",
    "est = smf.ols('MEDV ~ {}'.format(string_cols),data = data).fit()\n",
    "print(est.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.284574Z"
    }
   },
   "outputs": [],
   "source": [
    "#removing AGE and INDUS\n",
    "string_cols = ' + '.join(data.columns[:-1].difference(['AGE','INDUS']))\n",
    "est = smf.ols('MEDV ~ {}'.format(string_cols),data = data).fit()\n",
    "print(est.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Interaction term"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.286015Z"
    }
   },
   "outputs": [],
   "source": [
    "est = smf.ols('MEDV ~ LSTAT*AGE',data = data).fit()\n",
    "print(est.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.287554Z"
    }
   },
   "outputs": [],
   "source": [
    "#adding power term\n",
    "tmp = data.copy()\n",
    "tmp['LSTAT_2'] = tmp['LSTAT']**2\n",
    "est = smf.ols('MEDV ~ LSTAT + LSTAT_2',data = tmp).fit()\n",
    "print(est.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.288469Z"
    }
   },
   "outputs": [],
   "source": [
    "#adding power term\n",
    "# tmp = data.copy()\n",
    "# tmp['LSTAT_2'] = tmp['LSTAT']**2\n",
    "est = smf.ols('MEDV ~ np.power(LSTAT,5)',data = data).fit()\n",
    "print(est.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Qualitative predictors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.289501Z"
    }
   },
   "outputs": [],
   "source": [
    "carseats = pd.read_csv(r'E:\\programming\\dataset\\Into_to_statstical_learning\\carseats.csv')\n",
    "print(carseats.shape)\n",
    "carseats.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-10-07T11:57:20.290860Z"
    }
   },
   "outputs": [],
   "source": [
    "est = smf.ols('Sales ~ ShelveLoc',data = carseats).fit()\n",
    "print(est.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#  we can see that "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
